Catherine Heller Jeremy Hess Rudra Menon

Final Project CMSC320 2019

library(magrittr)
library(rvest)
## Loading required package: xml2
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.0.1     ✔ purrr   0.3.0
## ✔ readr   1.3.1     ✔ stringr 1.3.1
## ✔ tibble  2.0.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()        masks magrittr::extract()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
## ✖ purrr::set_names()      masks magrittr::set_names()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the raw event log exported for the team (one row per recorded event).
stats <- read_csv(file = "SpaceBastards-stats.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Date/Time` = col_datetime(format = ""),
##   `Point Elapsed Seconds` = col_double(),
##   `Our Score - End of Point` = col_double(),
##   `Their Score - End of Point` = col_double(),
##   `Hang Time (secs)` = col_double(),
##   `Player 7` = col_logical(),
##   `Player 8` = col_logical(),
##   `Player 9` = col_logical(),
##   `Player 10` = col_logical(),
##   `Player 11` = col_logical(),
##   `Player 12` = col_logical(),
##   `Player 13` = col_logical(),
##   `Player 14` = col_logical(),
##   `Player 15` = col_logical(),
##   `Player 16` = col_logical(),
##   `Player 17` = col_logical(),
##   `Player 18` = col_logical(),
##   `Player 19` = col_logical(),
##   `Player 20` = col_logical(),
##   `Player 21` = col_logical()
##   # ... with 7 more columns
## )
## See spec(...) for full column specifications.
## Warning: 4838 parsing failures.
## row col   expected     actual                      file
##   1  -- 52 columns 42 columns 'SpaceBastards-stats.csv'
##   2  -- 52 columns 42 columns 'SpaceBastards-stats.csv'
##   3  -- 52 columns 42 columns 'SpaceBastards-stats.csv'
##   4  -- 52 columns 42 columns 'SpaceBastards-stats.csv'
##   5  -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## ... ... .......... .......... .........................
## See problems(...) for more details.
# Keep only the columns used in the analysis, giving the long ones short names.
# NOTE: "Tournamemnt" is spelled that way in the CSV header; do not correct it.
stats <- select(
  stats,
  "Date/Time",
  tournament = "Tournamemnt",
  opponent = "Opponent",
  time = "Point Elapsed Seconds",
  "Line",
  ourscore = "Our Score - End of Point",
  theirscore = "Their Score - End of Point",
  "Event Type",
  "Action",
  "Passer",
  rec = "Receiver",
  "Defender",
  # The seven players on the field for the point.
  p0 = "Player 0",
  p1 = "Player 1",
  p2 = "Player 2",
  p3 = "Player 3",
  p4 = "Player 4",
  p5 = "Player 5",
  p6 = "Player 6"
)

stats
## # A tibble: 4,827 x 19
##    `Date/Time`         tournament opponent  time Line  ourscore theirscore
##    <dttm>              <chr>      <chr>    <dbl> <chr>    <dbl>      <dbl>
##  1 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  2 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  3 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  4 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  5 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  6 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  7 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  8 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
##  9 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
## 10 2019-04-01 00:09:00 Easterns   Northea…   229 O            1          0
## # … with 4,817 more rows, and 12 more variables: `Event Type` <chr>,
## #   Action <chr>, Passer <chr>, rec <chr>, Defender <chr>, p0 <chr>,
## #   p1 <chr>, p2 <chr>, p3 <chr>, p4 <chr>, p5 <chr>, p6 <chr>
# Build a per-player table of action counts and a plus/minus score.

# Attribute every event to one player: the defender when there is no passer,
# the receiver when the action is a drop, and the passer otherwise.
plusminus <- stats %>%
  mutate(
    Player = ifelse(
      is.na(Passer),
      Defender,
      ifelse(Action == "Drop", rec, Passer)
    )
  )

# Add a "Score" event for every "Goal", credited to the receiver, so that a
# goal counts once for the thrower (Goal) and once for the catcher (Score).
plusminus <- bind_rows(
  plusminus,
  plusminus %>%
    filter(Action == "Goal") %>%
    mutate(Action = "Score", Player = rec)
)

# Count actions per player. The result stays grouped by Player; later steps
# rely on that grouping.
plusminus <- plusminus %>%
  group_by(Player) %>%
  count(Action)

# Drop the last 6 rows, which are not useful for the analysis.
# NOTE(review): presumably these are the rows with no attributed player, which
# sort last -- confirm against the data before changing the constant.
# seq_len() with a floor of 0 avoids the 1:0 trap when there are <= 6 rows.
rows <- nrow(plusminus)
plusminus <- plusminus[seq_len(max(rows - 6, 0)), ]

# Reshape to one row per player with one column per action, and remove the
# placeholder players "0" and "Anonymous".
plusminus <- plusminus %>%
  group_by(Player) %>%
  spread(key = "Action", value = n) %>%
  filter(Player != "0", Player != "Anonymous")

# A player who never performed an action has a count of zero, not NA.
plusminus[is.na(plusminus)] <- 0

# Add the plus_minus column: positive plays minus turnovers.
plusminus$plus_minus <- with(
  plusminus,
  Callahan + D + Goal + Score - Stall - Throwaway - Drop
)

plusminus
## # A tibble: 31 x 12
## # Groups:   Player [31]
##    Player Callahan Catch     D  Drop  Goal  Pull PullOb Score Stall
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1 Aaron         0     8     1     4     0     0      0     8     0
##  2 Adam W        0   410     4     5    20     1      1     4     0
##  3 Ari           0    69     6     8    11     0      0    27     0
##  4 Austin        0    16     6     2     1     0      0     3     0
##  5 Baugh         0    17    15     0     0     0      0    14     0
##  6 Berg          0    90     3     2     7     0      0     2     0
##  7 Boots         0   173    31     8    58    31     14    63     0
##  8 Brand…        0     4     3     2     0     0      0     0     0
##  9 Colin         0    45     5     3     0     0      0     8     0
## 10 DFB           0   384     5    11    18    32      5     5     1
## # … with 21 more rows, and 2 more variables: Throwaway <dbl>,
## #   plus_minus <dbl>
# Passing percentage per player: completed throws (catches plus assists) as a
# share of all throws, where throwaways and stalls are the failed ones.
passing <- select(
  plusminus,
  Player,
  Completions = "Catch",
  Assists = "Goal",
  "Throwaway",
  "Stall"
)
passing$pass_perc <- with(
  passing,
  (Completions + Assists) /
    (Completions + Assists + Throwaway + Stall) * 100
)
passing
## # A tibble: 31 x 6
## # Groups:   Player [31]
##    Player  Completions Assists Throwaway Stall pass_perc
##    <chr>         <dbl>   <dbl>     <dbl> <dbl>     <dbl>
##  1 Aaron             8       0         0     0     100  
##  2 Adam W          410      20        20     0      95.6
##  3 Ari              69      11         7     0      92.0
##  4 Austin           16       1         7     0      70.8
##  5 Baugh            17       0         0     0     100  
##  6 Berg             90       7         5     0      95.1
##  7 Boots           173      58        29     0      88.8
##  8 Brandon           4       0         1     0      80  
##  9 Colin            45       0         1     0      97.8
## 10 DFB             384      18        22     1      94.6
## # … with 21 more rows
# Collapse the event log to one row per point. A point is identified by the
# tournament, the opponent, the score at the end of the point and the seven
# players on the field.
points <- count(
  stats,
  tournament, opponent, ourscore, theirscore,
  p0, p1, p2, p3, p4, p5, p6
)

# Keep only the player slots, then tally how often each name appears, which
# is the number of points each player was on the field for.
points <- select(points, p0, p1, p2, p3, p4, p5, p6)

points_played <- as.data.frame(table(unlist(points)))
points_played
##        Var1 Freq
## 1     Aaron   59
## 2    Adam W  177
## 3       Ari  213
## 4    Austin   63
## 5     Baugh  126
## 6      Berg  155
## 7     Boots  307
## 8   Brandon   29
## 9     Colin  134
## 10      DFB  223
## 11     Drew   11
## 12    Grant  219
## 13 Greenlee   30
## 14     Jack   63
## 15   Jeremy  186
## 16    Jimmy   63
## 17      Jip   35
## 18     Joel  193
## 19   Johnny  132
## 20     Luke  175
## 21    Mason  143
## 22 Matt Joy   52
## 23  Michael   15
## 24    Moose  131
## 25     Paul   30
## 26     Rudy  340
## 27     Ryan   60
## 28   Sheedy   42
## 29    Steve  171
## 30     Theo   46
## 31     Will  137
# Combine each player's plus/minus with the number of points they played.
# The two tables are matched on the player's name rather than by row position,
# so a player missing from one table or a different sort order cannot pair a
# player with someone else's count. Unmatched players get NA points_played.
plot1 <- plusminus %>%
  ungroup() %>%
  left_join(
    points_played %>%
      # table() stores the names as a factor in Var1; join on plain strings.
      transmute(Player = as.character(Var1), points_played = Freq),
    by = "Player"
  ) %>%
  select(Player, plus_minus, points_played)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
# Plus/minus normalised by playing time.
plot1 <- plot1 %>%
  mutate(per_point = plus_minus / points_played)

# Scatter of points played against per-point plus/minus with a linear fit.
plot <- ggplot(
  data = plot1,
  mapping = aes(label = Player, x = per_point, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)

ggplotly(plot, tooltip = c("Player", "per_point", "points_played"))

# Fit a loess curve of points played on per-point plus/minus, then flag the
# players whose actual points played are within 50 of the fitted value.
loess_fit <- loess(points_played ~ per_point, plot1)
plot1 <- plot1 %>%
  mutate(
    prediction = predict(loess_fit, per_point),
    diff = abs(prediction - points_played),
    hyp = diff < 50
  )

plot1
## # A tibble: 31 x 7
##    Player  plus_minus points_played per_point prediction   diff hyp  
##    <chr>        <dbl>         <int>     <dbl>      <dbl>  <dbl> <lgl>
##  1 Aaron            5            59    0.0847       95.1  36.1  TRUE 
##  2 Adam W           3           177    0.0169       82.3  94.7  FALSE
##  3 Ari             29           213    0.136       172.   40.5  TRUE 
##  4 Austin           1            63    0.0159       82.8  19.8  TRUE 
##  5 Baugh           29           126    0.230       120.    6.40 TRUE 
##  6 Berg             5           155    0.0323       77.9  77.1  FALSE
##  7 Boots          115           307    0.375       301.    5.66 TRUE 
##  8 Brandon          0            29    0            91.3  62.3  FALSE
##  9 Colin            9           134    0.0672       74.0  60.0  FALSE
## 10 DFB             -6           223   -0.0269      110.  113.   FALSE
## # … with 21 more rows
# One-sided test, using the normal approximation, of whether the share of
# players flagged in plot1$hyp exceeds the null proportion pa.

# Sample size is the number of players in the table, not a fixed constant.
n <- nrow(plot1)
pa <- 0.5
ex <- pa
# Variance of a sample proportion under the null hypothesis.
var_x <- pa * (1 - pa) / n
# Observed share of flagged players. sum() counts only TRUE values, so an NA
# flag is not counted as a success.
mean <- sum(plot1$hyp, na.rm = TRUE) / n

std <- sqrt(var_x)
# Upper-tail probability of seeing a share at least this large under the null.
p_value <- 1 - pnorm(mean, ex, std)
p_value
## [1] 0.8154143
# Scatter of points played against raw plus/minus with a linear fit.
plot <- ggplot(
  data = plot1,
  mapping = aes(label = Player, x = plus_minus, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)

ggplotly(plot, tooltip = c("Player", "plus_minus", "points_played"))
# Combine each player's passing percentage with the number of points played.
# The tables are matched on the player's name rather than by row position, and
# the grouping inherited from `passing` is dropped explicitly instead of being
# collapsed with the deprecated funs() helper.
pass_perc <- passing %>%
  ungroup() %>%
  left_join(
    points_played %>%
      # table() stores the names as a factor in Var1; join on plain strings.
      transmute(Player = as.character(Var1), points_played = Freq),
    by = "Player"
  ) %>%
  select(Player, pass_perc, points_played)
pass_perc
## # A tibble: 31 x 3
##    Player  pass_perc points_played
##    <chr>       <dbl>         <int>
##  1 Aaron       100              59
##  2 Adam W       95.6           177
##  3 Ari          92.0           213
##  4 Austin       70.8            63
##  5 Baugh       100             126
##  6 Berg         95.1           155
##  7 Boots        88.8           307
##  8 Brandon      80              29
##  9 Colin        97.8           134
## 10 DFB          94.6           223
## # … with 21 more rows
# Scatter of points played against passing percentage with a linear fit.
plot <- ggplot(
  data = pass_perc,
  mapping = aes(label = Player, x = pass_perc, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)

ggplotly(plot, tooltip = c("Player"))